Find Common Forecasts

# Load the scored forecasts from disk and pull out the score-card object
# named in the report parameters. Guard-clause style: fail fast on a
# missing file, then on a missing object inside that file.
if (!file.exists(params$score_file)) {
  stop(paste("File not found:", params$score_file))
}
load(params$score_file)
if (!exists(params$card_name)) {
  stop(paste("score_card not in file, no object named", params$card_name))
}
score_cards = get(params$card_name)

# Restrict all score cards to the (ahead, location, target date) tasks that
# CMU-TimeSeries actually forecasted, so every comparison below shares a
# common task set.
score_cards_cmu = score_cards %>% 
                    filter(forecaster == "CMU-TimeSeries") %>% 
                    distinct(ahead, geo_value, target_end_date)
# Explicit `by` makes the join keys clear and silences dplyr's
# "Joining, by = ..." message; the natural join would have used exactly
# these three columns anyway.
score_cards = semi_join(score_cards, score_cards_cmu,
                        by = c("ahead", "geo_value", "target_end_date"))

# Count non-missing WIS scores per forecaster and ahead over the last 4
# weeks, spread into one column per ahead value (num_1, num_2, ...).
recent_score_by_frcstr = score_cards %>% 
                           filter(target_end_date > today() - 28) %>%
                           group_by(forecaster, ahead) %>% 
                           summarize(num = sum(!is.na(wis))) %>%
                           pivot_wider(names_from = ahead, 
                                       names_prefix = "num_", 
                                       values_from = num)
# Sum across however many ahead columns exist, instead of hard-coding
# num_1 + num_2 + num_3 + num_4 (which silently breaks if the set of
# aheads ever changes). NA propagates as before (no na.rm).
recent_score_by_frcstr = recent_score_by_frcstr %>% 
                           mutate(num_total = rowSums(across(starts_with("num_")))) %>%
                           arrange(desc(num_total))
print(recent_score_by_frcstr, n = Inf)
## # A tibble: 13 x 6
## # Groups:   forecaster [13]
##    forecaster            num_1 num_2 num_3 num_4 num_total
##    <chr>                 <int> <int> <int> <int>     <int>
##  1 CMU-TimeSeries         4896  4896  4896  4896     19584
##  2 CovidAnalytics-DELPHI  4896  4896  4896  4896     19584
##  3 COVIDhub-baseline      4896  4896  4896  4896     19584
##  4 COVIDhub-ensemble      4896  4896  4896  4896     19584
##  5 JHU_IDD-CovidSP        4896  4896  4896  4896     19584
##  6 LANL-GrowthRate        4896  4896  4896  4896     19584
##  7 OliverWyman-Navigator  4896  4896  4896  4896     19584
##  8 UCLA-SuEIR             4896  4896  4896  4896     19584
##  9 UMass-MechBayes        4896  4896  4896  4896     19584
## 10 UT-Mobility            4896  4896  4896  4896     19584
## 11 MOBS-GLEAM_COVID       4800  4800  4800  4800     19200
## 12 GT-DeepCOVID           4728  4584  4464  4392     18168
## 13 IHME-CurveFit          2448  2448  1224  1224      7344
# CMU's total recent forecast count is the reference for "active".
cmu_forecasts = recent_score_by_frcstr %>%
                  filter(forecaster == "CMU-TimeSeries") %>%
                  pull(num_total)

# A forecaster is active if it produced more than 80% as many recent
# scores as CMU did.
active_forecasters = recent_score_by_frcstr %>%
                       filter(num_total > cmu_forecasts * 0.80) %>%
                       pull(forecaster)

score_cards = score_cards %>% 
                filter(forecaster %in% active_forecasters)

# save for pairwise tournament
# (the tournament intersects tasks pairwise itself, so it needs the data
# from before the common-task restriction below)
score_cards_tourney = score_cards
# TODO: Use evalcast::intersect_averagers after merging the kill-cards updates
# For every (ahead, location, target date) task, count how many forecasters
# scored it; filtering to quantile == 0.5 leaves exactly one row per
# forecaster per task, so n() is the forecaster count.
forecaster_coverage = score_cards %>% 
                        filter(quantile == 0.5) %>% 
                        group_by(ahead, geo_value, target_end_date) %>% 
                        summarise(num_forecasts = n())
# A task is "common" when every active forecaster scored it.
max_forecasts = max(forecaster_coverage$num_forecasts)
table(forecaster_coverage$num_forecasts)
## 
##    5    9   10   11   12 
##  204   68  676 1283 1907
# Keep only the tasks that every active forecaster scored, then restrict
# the score cards to those common tasks.
common_forecasts = forecaster_coverage %>% 
                            filter(num_forecasts == max_forecasts) %>%
                            select(ahead, geo_value, target_end_date)


# Explicit join keys (rather than dplyr's guessed natural join); these are
# exactly the columns common_forecasts carries.
score_cards = semi_join(score_cards, common_forecasts,
                        by = c("ahead", "geo_value", "target_end_date"))

Dot plots

First we make dot plots (not the same as that in evalcast): one dot per ahead, forecaster, and forecast date. The red plus marks the score computed over all dates. Here we use the mean as the aggregator function, and we study WIS, AE, and coverage-80.

# Aggregators that ignore missing values, so a handful of NA scores do not
# wipe out a forecaster's entire summary.
Mean = function(x) {
  mean(x, na.rm = TRUE)
}
Median = function(x) {
  median(x, na.rm = TRUE)
}

# Summarize column `var` (given by name) with aggregator `aggr`, both per
# (forecaster, ahead, target_end_date) -- column `var` -- and overall per
# (forecaster, ahead) -- column `var_overall`. The overall table also gets
# an `order` column ranking forecasters within each ahead (used as the
# x-axis position in dot_plot). Returns the full join of the two tables.
summarize_var = function(df, var, aggr = Mean) {
  df_by_date = df %>% 
    group_by(forecaster, ahead, target_end_date) %>%
    summarize(var = aggr(!!as.symbol(var))) %>%
    ungroup()
  df_overall = df %>%
    group_by(forecaster, ahead) %>%
    summarize(var_overall = aggr(!!as.symbol(var))) %>%
    ungroup() %>% group_by(ahead) %>%
    arrange(var_overall, .by_group = TRUE) %>%
    ungroup() %>%
    mutate(order = row_number())
  # Return the join explicitly; the original relied on the (invisibly
  # returned) value of a trailing assignment.
  full_join(df_by_date, df_overall, by = c("forecaster", "ahead"))
}

# Dot plot of a summarized score: one dot per (forecaster, ahead,
# target_end_date), colored by date, with a red plus marking the score
# aggregated over all dates. Faceted by ahead; x positions come from the
# per-ahead ranking produced by summarize_var().
dot_plot = function(df, var = "wis", ylab = var, ylim = NULL, aggr = Mean) {
  dat = summarize_var(df, var, aggr)
  dat$ahead = factor(paste("ahead =", dat$ahead))
  
  ggplot(dat, aes(x = order, y = var)) +
    geom_point(aes(color = target_end_date)) + 
    geom_point(aes(x = order, y = var_overall), color = "red", shape = 3) +
    facet_wrap(vars(ahead), scales = "free") + 
    labs(x = "Forecaster", y = ylab) +
    scale_x_continuous(breaks = dat$order, labels = dat$forecaster) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
    coord_cartesian(ylim = ylim)
}
# WIS and AE on a log scale (both are heavy-tailed across locations).
dot_plot(score_cards, var = "wis", ylab = "Mean WIS") + scale_y_log10()

dot_plot(score_cards, var = "ae", ylab = "Mean AE") + scale_y_log10()

# Empirical coverage of the 80% intervals; well-calibrated forecasters
# should sit near the nominal 0.8 line.
dot_plot(score_cards, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
  geom_hline(yintercept = 0.8)

Dot plots: median and 90th percentile

Same as before, but change the aggregator function to the median. Omitting AE for brevity, henceforth.

# Median aggregation is more robust to the few extreme WIS values.
dot_plot(score_cards, var = "wis", ylab = "Median WIS", aggr = Median) +
  scale_y_log10()

And now we change the aggregator function to be the 90th percentile.

# Tail behavior: 90th percentile of WIS per forecaster/ahead/date.
dot_plot(score_cards, var = "wis", ylab = "90th percentile WIS", 
         aggr = function(x) quantile(x, prob = 0.9, na.rm = TRUE)) +
  scale_y_log10()

Line plots

Now we make line plots: one line per ahead and forecaster, as a function of forecast date. Here we use the mean as the aggregator function, and we look at WIS and coverage-80.

# From https://stackoverflow.com/questions/15282580/
# Build a large qualitative palette by concatenating all of RColorBrewer's
# qualitative palettes, and return exactly `n` colors (recycling if `n`
# exceeds the ~74 distinct colors available). The original ignored `n` and
# returned the whole palette; ggplot tolerated the surplus, but honoring
# `n` makes the function do what its signature promises.
color_picker = function(n) {
  qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
  all_cols = unlist(mapply(brewer.pal, qual_col_pals$maxcolors,
                           rownames(qual_col_pals)))
  rep(all_cols, length.out = n)
}

# Line plot of a summarized score: one line (and point layer) per
# forecaster as a function of target date, faceted by ahead.
line_plot = function(df, var = "wis", ylab = var, ylim = NULL, aggr = Mean) {
  df_sum = summarize_var(df, var, aggr)
  df_sum$ahead = factor(paste("ahead =", df_sum$ahead))
  
  ggplot(df_sum, aes(x = target_end_date, y = var)) +
    geom_line(aes(color = forecaster, linetype = forecaster)) +
    geom_point(aes(color = forecaster)) +
    facet_wrap(vars(ahead), scales = "free") + 
    labs(x = "Date", y = ylab) +
    coord_cartesian(ylim = ylim) +
    # Bug fix: `forecaster` is a column of df_sum, not a variable in this
    # function's scope; the original `length(unique(forecaster))` would
    # error with "object 'forecaster' not found" (or silently pick up an
    # unrelated global).
    scale_color_manual(values = color_picker(length(unique(df_sum$forecaster))))
}
# Mean WIS over time (log scale), and coverage-80 against the nominal line.
line_plot(score_cards, var = "wis", ylab = "Mean WIS") + scale_y_log10() 

line_plot(score_cards, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
  geom_hline(yintercept = 0.8)

Scaling by baseline

We scale each score, per location and forecast date, by the COVIDhub-baseline score; then we take the mean or median.

Important note on the order of operations here: scale then aggregate. The other way around: aggregate then scale, would be a simple post-adjustment applied to the metrics we computed earlier. This way: scale then aggregate, results in a different final metric altogether. It is potentially interesting as it provides a nonparametric spatiotemporal adjustment; assuming that space and time effects are multiplicative, we’re directly “canceling them out” by taking ratios.

Here are dot plots for scaled WIS, with mean as the aggregator. Omitting median for brevity.

# Note to self: mutate_at() gave me a weird bug below! From now on, better use
# mutate() with across() instead ...
# Scale a score column by the base forecaster's score, per
# (geo_value, target_end_date, ahead) task:
#   1. drop quantile-level detail and collapse to one row per task/forecaster;
#   2. pivot forecasters into columns named paste0(var, forecaster);
#   3. divide every such column by the base forecaster's column;
#   4. pivot back to long form, with the ratio in `scaled` and the
#      forecaster name recovered by stripping the `var` prefix;
#   5. drop the base forecaster itself (its ratio is identically 1).
# NOTE(review): tasks a forecaster did not score appear as NA after the
# pivot, so ratios are only defined on shared tasks — presumably the
# intended implicit intersection; confirm for the tournament use.
scale_df = function(df, var, base_forecaster = "COVIDhub-baseline") {
  df %>% select(-c(quantile, value, forecast_date)) %>%
    distinct() %>%
    pivot_wider(id_cols = c(geo_value, target_end_date, ahead),
                names_from = "forecaster", names_prefix = var, 
                values_from = var) %>%
    mutate(across(starts_with(var), ~ .x /
                !!as.symbol(paste0(var, base_forecaster)))) %>%
    pivot_longer(cols = starts_with(var), names_to = "forecaster",
                 values_to = "scaled") %>%
    mutate(forecaster = substring(forecaster, nchar(var) + 1)) %>%
    filter(forecaster != base_forecaster)
}
# Dot plot of baseline-scaled WIS; values below the line at 1 beat the
# baseline forecaster.
dot_plot(scale_df(score_cards, var = "wis"), var = "scaled", 
         ylab = "Mean scaled WIS") + geom_hline(yintercept = 1) 

Here are now line plots for mean scaled WIS.

# Line plot of baseline-scaled WIS; the horizontal line marks parity with
# the baseline.
line_plot(scale_df(score_cards, var = "wis"), var = "scaled", 
          ylab = "Mean scaled WIS") + geom_hline(yintercept = 1) 

Centering by baseline

Similar to what we did previously but just with centering instead of scaling.

Note on order of operations: center then aggregate versus aggregate then center are still in general different strategies. As before we’re adhering to the first way, with a similar motivation: if space and time effects were now additive, then this way would “cancel them out” directly. However, when the aggregator is a linear operation (e.g., mean), the two strategies essentially reduce to the same thing (“essentially”, not exactly, because setting na.rm = TRUE generally turns any linear operator into a nonlinear one).

Here are the dot plots for mean centered WIS. Omitting median for brevity.

# Center a score column by the base forecaster's score, per
# (geo_value, target_end_date, ahead) task. The original implemented
# centering as log(exp(x) / exp(base)) via scale_df(), which overflows to
# Inf/Inf = NaN whenever a score exceeds ~709 (the double-precision limit
# of exp()); WIS can exceed that. Centering is just subtraction, so do it
# directly with the same pivot machinery as scale_df().
center_df = function(df, var, base_forecaster = "COVIDhub-baseline") {
  df %>% select(-c(quantile, value, forecast_date)) %>%
    distinct() %>%
    pivot_wider(id_cols = c(geo_value, target_end_date, ahead),
                names_from = "forecaster", names_prefix = var,
                values_from = var) %>%
    # Subtract the base forecaster's column from every score column.
    mutate(across(starts_with(var), ~ .x -
                !!as.symbol(paste0(var, base_forecaster)))) %>%
    pivot_longer(cols = starts_with(var), names_to = "forecaster",
                 values_to = "centered") %>%
    mutate(forecaster = substring(forecaster, nchar(var) + 1)) %>%
    filter(forecaster != base_forecaster)
}
# Dot plot of baseline-centered WIS; values below the line at 0 beat the
# baseline forecaster.
dot_plot(center_df(score_cards, var = "wis"), var = "centered", 
         ylab = "Mean centered WIS") + geom_hline(yintercept = 0)  

Here are now the line plots for mean centered WIS.

# Line plot of baseline-centered WIS; the horizontal line marks parity.
line_plot(center_df(score_cards, var = "wis"), var = "centered", 
          ylab = "Mean centered WIS") + geom_hline(yintercept = 0) 

Pairwise tournament

We run a pairwise tournament. This is inspired by Johannes Bracher’s analysis (and similar ideas in the literature). Except, the order of operations here is different: scale then aggregate (whereas Johannes did: aggregate then scale). The motivation for this was explained above (thinking of it as providing a nonparametric spatiotemporal adjustment), as was the fact that the order of operations really makes a difference.

For each pair of forecasters \(f\) and \(g\), we compute:

\[ \theta_{fg} = A\bigg\{ \frac{S(f;\ell,d,a)}{S(g;\ell,d,a)} \;:\; \text{common locations $\ell$, forecast dates $d$, and ahead values $a$} \bigg\} \]

where \(S\) is a score of interest, say WIS, and \(A\) is an aggregator of interest, say the mean. Important note: we aggregate over all common locations, dates, and ahead values, which may differ for each pair \(f,g\). To compute an overall metric for forecaster \(f\), we use:

\[ \theta_f = \bigg( \prod_g \theta_{fg} \bigg)^{1/F}. \]

the geometric mean of all pairwise comparisons of \(f\) to other forecasters (here \(F\) is the total number of forecasters). Another interesting option would be to define \((\theta_f)_{f \in F}\) as the top left singular vector of the matrix \((\theta_{fg})_{f,g \in F}\), which we’ll also investigate.

# Pairwise tournament: for each base forecaster f, scale every other
# forecaster's scores by f's (per common task, via scale_df) and aggregate
# the ratios with `aggr`. Entry [g, f] of the matrix is theta_{gf}; the
# diagonal is left NA.
#
# Returns a list:
#   mat  - the theta matrix (diagonal NA),
#   df   - the same matrix in long form, for ggplot,
#   vec1 - overall metric per forecaster: geometric mean of its row,
#   vec2 - overall metric per forecaster: top left singular vector of the
#          matrix (with the diagonal temporarily set to 1 so log/SVD work).
# NOTE(review): the sign of a singular vector is arbitrary; vec2 may come
# back all-negative — confirm downstream use is sign-insensitive.
pairwise_tournament = function(df, var, aggr = Mean) {
  forecasters = unique(df$forecaster)
  theta_mat = matrix(NA, length(forecasters), length(forecasters))
  rownames(theta_mat) = colnames(theta_mat) = forecasters
  for (f in forecasters) {
    # Column f: every forecaster's scores scaled by f's, then aggregated.
    result =  scale_df(df, var, base_forecaster = f) %>% 
      group_by(forecaster) %>%
      summarize(v = aggr(scaled))
    theta_mat[result$forecaster, f] = result$v
  }
  
  # Convert to data frame for convenience with ggplot
  theta_df = as.data.frame(theta_mat) %>%
    mutate(Forecaster1 = forecasters) %>%
    pivot_longer(cols = -Forecaster1, names_to = "Forecaster2",
                 values_to = "value")
  
  # Compute overall metrics two ways: geometric mean, SVD
  theta_vec1 = exp(rowMeans(log(theta_mat), na.rm = TRUE))
  diag(theta_mat) = 1 # so the SVD won't fail; undo it later
  theta_vec2 = as.numeric(svd(theta_mat, nu = 1)$u)
  names(theta_vec2) = names(theta_vec1)
  diag(theta_mat) = NA
  
  return(list(mat = theta_mat, df = theta_df, vec1 = theta_vec1, 
              vec2 = theta_vec2))
}
# Tournament with a trimmed mean (1% off each tail) so a few extreme score
# ratios cannot dominate the aggregate.
theta = pairwise_tournament(score_cards_tourney, var = "wis", aggr = 
                              function(x) mean(x, trim = 0.01, na.rm = TRUE))
# Order forecasters by the geometric-mean overall metric, best first.
ranked_list = rownames(theta$mat)[order(theta$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Heatmap of the pairwise theta matrix, rows/columns sorted by rank.
ggplot(theta$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta$vec1), forecaster = ranked_list,
                        theta = sort(theta$vec1), row.names = NULL))
rank forecaster theta
1 COVIDhub-ensemble 0.9569131
2 OliverWyman-Navigator 1.1360359
3 UMass-MechBayes 1.1485300
4 CMU-TimeSeries 1.1804353
5 GT-DeepCOVID 1.3146316
6 LANL-GrowthRate 1.4739125
7 COVIDhub-baseline 1.5388537
8 MOBS-GLEAM_COVID 1.6221571
9 CovidAnalytics-DELPHI 1.6489339
10 UCLA-SuEIR 1.8982002
11 JHU_IDD-CovidSP 2.1590448
12 UT-Mobility 2.5742527

For curiosity, we can plot the agreement between the overall metrics computed via GM and SVD. The agreement is basically perfect!

# Scatter of the two overall metrics: geometric mean vs. top singular
# vector, one point per forecaster.
ggplot() + geom_point(aes(x = theta$vec1, y = theta$vec2)) +
  labs(x = "Geometric mean", y = "Top left singular vector")

# NOTE(review): this second plot is an exact duplicate of the one above —
# likely a copy/paste artifact; consider removing one of the two.
ggplot() + geom_point(aes(x = theta$vec1, y = theta$vec2)) +
  labs(x = "Geometric mean", y = "Top left singular vector")

Pairwise tournament: median and 90th percentile

Repeat the same pairwise tournament but with the median as the aggregator.

# Re-run the tournament with the median as the aggregator.
theta = pairwise_tournament(score_cards_tourney, var = "wis", aggr = Median)

ranked_list = rownames(theta$mat)[order(theta$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Same heatmap as before, ordered by the new overall ranking.
ggplot(theta$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta$vec1), forecaster = ranked_list,
                        theta = sort(theta$vec1), row.names = NULL))
rank forecaster theta
1 COVIDhub-ensemble 0.7363054
2 OliverWyman-Navigator 0.7618537
3 UMass-MechBayes 0.7639400
4 CMU-TimeSeries 0.8509865
5 GT-DeepCOVID 0.8867668
6 MOBS-GLEAM_COVID 1.0286859
7 COVIDhub-baseline 1.0589439
8 LANL-GrowthRate 1.0709268
9 CovidAnalytics-DELPHI 1.1460644
10 UT-Mobility 1.2653671
11 JHU_IDD-CovidSP 1.2995943
12 UCLA-SuEIR 1.4064702

And now with the 90th percentile as the aggregator.

# Re-run the tournament with the 90th percentile as the aggregator.
theta = pairwise_tournament(score_cards_tourney, var = "wis", aggr =
                              function(x) quantile(x, prob = 0.9, na.rm = TRUE))

ranked_list = rownames(theta$mat)[order(theta$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Same heatmap as before, ordered by the new overall ranking.
ggplot(theta$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta$vec1), forecaster = ranked_list,
                        theta = sort(theta$vec1), row.names = NULL))
rank forecaster theta
1 COVIDhub-ensemble 1.904414
2 UMass-MechBayes 2.506068
3 OliverWyman-Navigator 2.538642
4 CMU-TimeSeries 2.552495
5 GT-DeepCOVID 2.898721
6 LANL-GrowthRate 3.171466
7 COVIDhub-baseline 3.340881
8 CovidAnalytics-DELPHI 3.663427
9 MOBS-GLEAM_COVID 3.724192
10 UCLA-SuEIR 4.243885
11 JHU_IDD-CovidSP 5.131937
12 UT-Mobility 6.633000

Interval Error (experimental)

Probabilistic forecasters may be judged on both their point estimates and confidence intervals. In the previous tabs, we have primarily focused on coverage-80 (which evaluates 80% confidence intervals) and WIS (which evaluates a combination of intervals). We briefly touched on Average Error as a measure of point estimates, however this does not account for spatial and temporal differences in forecasting difficulty.

Interval Error is an experimental attempt to evaluate point estimates while considering that forecasting difficulty is not uniform.

The essential idea is to use the COVIDhub-ensemble 80% confidence interval as a proxy for forecasting difficulty. As most forecasters use a long tailed distribution for COVID cases and deaths, we presume a multiplicative uncertainty model. In particular,

\[ D_{\ell,d,a} = \log(P^{*90}_{\ell,d,a} / P^{*10}_{\ell,d,a}) \] where \(D_{\ell, d, a}\) is the normalization score at a location, date, ahead combo, and \(P^{*90}_{\ell,d,a}\) (resp. \(P^{*10}_{\ell,d,a}\)) is the ensemble forecaster’s 90th (resp. 10th) percentile forecast for the same combo.

For each forecaster \(f\) we take \[s_{\ell,d,a} = \log\big((P^{f50}_{\ell,d,a} + 1) / (A_{\ell,d} + 1)\big) / D_{\ell,d,a}\] where \(A_{\ell,d}\) is the actual value observed for that date and location, and \(P^{f50}_{\ell,d,a}\) is forecaster \(f\)’s median forecast. The rationale here is similar to if one were making predictions on normally distributed variables with different variances and chose to use standard deviation as a normalization metric, but with multiplicative error instead. Or to put it another way, if your forecast is off by a factor of 5, you’ll be dinged more if the ensemble indicated that the result would be within a factor of 2 than if it indicated a factor of 10. Because the normalization is based on the ensemble interval, it is provisionally referred to as “Interval Error” or IE.

# Per task, the (log) ratio of the ensemble's 0.9-quantile to 0.1-quantile
# forecast — a proxy for forecasting difficulty under a multiplicative
# error model. Rows are sorted by quantile so that, within each group,
# lag(value) is the 0.1-quantile when the current row is the 0.9-quantile.
ensemble_widths = score_cards %>%
                    filter(forecaster == "COVIDhub-ensemble") %>%
                    filter(quantile %in% c(0.1, 0.9)) %>%
                    arrange(quantile) %>%
                    group_by(geo_value, ahead, target_end_date) %>%
                    mutate(width = value / lag(value)) %>%
                    filter(!is.na(width)) %>%
                    # Bug fix: the original dropped only Inf widths
                    # (0.1-quantile of 0). A width of 0 (0.9-quantile of 0)
                    # gives log_width = -Inf, which would poison the IE
                    # ratio downstream, so drop non-positive widths too.
                    filter(is.finite(width), width > 0) %>%
                    mutate(log_width = log(width)) %>%
                    select(ahead, geo_value, target_end_date, width, log_width)

# Point (median) forecasts only; add 1 to both forecast and actual so the
# log-ratio below is defined when either is 0.
score_cards_ie = score_cards %>%
  filter(quantile == 0.5) %>%
  mutate(actual = actual + 1, value = value + 1)

# IE = |log(point / actual)| normalized by the ensemble's log interval
# width for the same task. Explicit join keys replace dplyr's guessed
# natural join (these are the only columns shared with ensemble_widths).
score_cards_ie = inner_join(score_cards_ie, ensemble_widths,
                            by = c("ahead", "geo_value", "target_end_date")) %>%
        mutate(ie = abs(log(value/actual)/log_width))

# Mean IE per forecaster/ahead/date (averaging over locations).
# summarise_at() is superseded in modern dplyr; a plain summarise() is
# equivalent for a single column and reads more clearly.
ie_summary = score_cards_ie %>%
               select(ahead, forecaster, target_end_date, ie) %>%
               group_by(forecaster, ahead, target_end_date) %>% 
               summarise(ie = mean(ie))
dot_plot(ie_summary, var = "ie", ylab = "IE")

line_plot(ie_summary, var = "ie", ylab = "IE")

We can also compare against the baseline forecaster, to benchmark the results against an unskilled forecaster.

# Pull out the baseline forecaster's IE per (ahead, target date) so every
# other forecaster can be differenced against it.
baseline_ie = ie_summary %>%
  ungroup() %>%
  filter(forecaster == "COVIDhub-baseline") %>%
  select(ahead, target_end_date, baseline_ie = ie)

# Attach the baseline IE to each row and compute the gap; negative
# differences mean the forecaster beat the baseline.
ie_summary = ie_summary %>%
  inner_join(baseline_ie, by = c("ahead", "target_end_date")) %>%
  mutate(difference = ie - baseline_ie)

dot_plot(ie_summary, var = "difference", ylab = "IE Difference")

line_plot(ie_summary, var = "difference", ylab = "IE Difference")

and the same graphs with just CMU, the baseline and Ensemble for visibility:

# Repeat the four IE plots restricted to CMU, the baseline, and the
# ensemble, for readability.
selected_forecasters = c("CMU-TimeSeries", "COVIDhub-baseline", "COVIDhub-ensemble")
ie_summary_small = ie_summary %>% filter(forecaster %in% selected_forecasters)

dot_plot(ie_summary_small, var = "ie", ylab = "IE")

line_plot(ie_summary_small, var = "ie", ylab = "IE")

dot_plot(ie_summary_small, var = "difference", ylab = "IE Difference")

line_plot(ie_summary_small, var = "difference", ylab = "IE Difference")